{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "0bb1cbb2",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas import Series, DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b95a0fc2",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/var/folders/rr/0mnyyv811fs5vyp22gf4fxk00000gn/T/ipykernel_51944/3873738921.py:8: DtypeWarning: Columns (29) have mixed types. Specify dtype option on import or set low_memory=False.\n",
      "  df = pd.read_csv(filename, usecols=['Plate ID',  'Registration State',\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Plate ID</th>\n",
       "      <th>Registration State</th>\n",
       "      <th>Vehicle Body Type</th>\n",
       "      <th>Vehicle Make</th>\n",
       "      <th>Violation Time</th>\n",
       "      <th>Street Name</th>\n",
       "      <th>Violation Legal Code</th>\n",
       "      <th>Vehicle Color</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>J58JKX</td>\n",
       "      <td>NJ</td>\n",
       "      <td>SDN</td>\n",
       "      <td>HONDA</td>\n",
       "      <td>0523P</td>\n",
       "      <td>43 ST</td>\n",
       "      <td>NaN</td>\n",
       "      <td>BK</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>KRE6058</td>\n",
       "      <td>PA</td>\n",
       "      <td>SUBN</td>\n",
       "      <td>ME/BE</td>\n",
       "      <td>0428P</td>\n",
       "      <td>UNION ST</td>\n",
       "      <td>NaN</td>\n",
       "      <td>BLK</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>444326R</td>\n",
       "      <td>NJ</td>\n",
       "      <td>SDN</td>\n",
       "      <td>LEXUS</td>\n",
       "      <td>0625A</td>\n",
       "      <td>CLERMONT AVENUE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>BLACK</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>F728330</td>\n",
       "      <td>OH</td>\n",
       "      <td>SDN</td>\n",
       "      <td>CHEVR</td>\n",
       "      <td>1106A</td>\n",
       "      <td>DIVISION AVE</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>FMY9090</td>\n",
       "      <td>NY</td>\n",
       "      <td>SUBN</td>\n",
       "      <td>JEEP</td>\n",
       "      <td>1253A</td>\n",
       "      <td>GRAND ST</td>\n",
       "      <td>NaN</td>\n",
       "      <td>GREY</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  Plate ID Registration State Vehicle Body Type Vehicle Make Violation Time  \\\n",
       "0   J58JKX                 NJ               SDN        HONDA          0523P   \n",
       "1  KRE6058                 PA              SUBN        ME/BE          0428P   \n",
       "2  444326R                 NJ               SDN        LEXUS          0625A   \n",
       "3  F728330                 OH               SDN        CHEVR          1106A   \n",
       "4  FMY9090                 NY              SUBN         JEEP          1253A   \n",
       "\n",
       "       Street Name Violation Legal Code Vehicle Color  \n",
       "0            43 ST                  NaN            BK  \n",
       "1         UNION ST                  NaN           BLK  \n",
       "2  CLERMONT AVENUE                  NaN         BLACK  \n",
       "3     DIVISION AVE                  NaN           NaN  \n",
       "4         GRAND ST                  NaN          GREY  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the NYC parking violations data into memory.  Only\n",
    "# load the following columns:'Plate ID',  'Registration State', \n",
    "# 'Vehicle Make', 'Vehicle Color', 'Vehicle Make', \n",
    "# Violation Time', 'Street Name', 'Violation Legal Code', \n",
    "\n",
    "filename = '../data/nyc-parking-violations-2020.csv'\n",
    "\n",
    "df = pd.read_csv(filename, usecols=['Plate ID',  'Registration State',\n",
    "                        'Vehicle Make', 'Vehicle Color', 'Vehicle Body Type', \n",
    "                                    'Violation Time', 'Street Name',\n",
    "                                   'Violation Legal Code'])\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "241f56b6",
   "metadata": {},
   "source": [
    "# Beyond 1\n",
    "\n",
    "Without calculating: Of the columns we loaded, which would make less sense to turn into categories?\n",
    "\n",
    "Once you've thought about it, calculate how many repeated values there are in each column, and determine (more formally) which would give the biggest ROI in using categories."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "364606a9",
   "metadata": {},
   "source": [
    "Columns with repeated values make more sense. Thus, registration state, body type, vehicle make, street name, and vehicle color would all seem to be good candidates.  However, plate ID wouldn't make sense, unless the same cars received a very large number of tickets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "11b40ded",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Violation Legal Code    2.226554e+08\n",
       "Registration State      1.837608e+07\n",
       "Vehicle Body Type       7.619201e+05\n",
       "Violation Time          6.835589e+05\n",
       "Vehicle Color           6.383835e+05\n",
       "Vehicle Make            2.386433e+05\n",
       "Street Name             2.163256e+04\n",
       "Plate ID                3.850149e+02\n",
       "dtype: float64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# This code shows, in descending order, the columns most likely to benefit from being made into categories\n",
    "(df.count() / df.nunique()).sort_values(ascending=False) * 100 "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8da75683",
   "metadata": {},
   "source": [
    "# Beyond 2\n",
    "\n",
    "In Exercise 25, we saw that the vehicle makes and colors were far from standardized, with numerous misspellings and variations. If we were to standardize the spellings before creating categories, would that make any effect on the memory savings we gain from categorization? Why or why not?"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "dc5120cd",
   "metadata": {},
   "source": [
    "We would benefit most by first standardizing the spellings, and only after creating the category. The more times a string is repeated, the greater the benefit from using a category for that string. When we standardize the spellings, we reduce the number of different strings in a column, and increase the number of times it repeats -- thus strengthening the argument for using a category."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "88e58090",
   "metadata": {},
   "source": [
    "# Beyond 3\n",
    "\n",
    "Read only the first 100,000 lines from the CSV file, but all columns. Show the 10 columns that will most likely benefit greatest from using categories?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "8caaecbe",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Violation Description                5615.000000\n",
       "Violation Legal Code                 5615.000000\n",
       "Law Section                          3333.333333\n",
       "Unregistered Vehicle?                2169.000000\n",
       "Violation County                     1086.333333\n",
       "Issuing Agency                        909.090909\n",
       "Feet From Curb                        833.333333\n",
       "Violation In Front Of Or Opposite     796.800000\n",
       "Date First Observed                   400.000000\n",
       "Plate Type                            344.827586\n",
       "dtype: float64"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\n",
    "filename = '../data/nyc-parking-violations-2020.csv'\n",
    "\n",
    "df = pd.read_csv(filename, nrows=10_000)\n",
    "(df.count() / df.nunique()).sort_values(ascending=False).head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee4c9260",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
